Risk Deep Dive: Detailed risk analysis¶
This notebook is built to work with the repo Make targets (e.g.make report-both YEARS="YYYY YYYY..." MONTHS="1 2 .. 12")
It always prefers the current run (your summaries/<RUN_TAG>/ folder) when you pass YEARS/MONTHS, so the report reflects exactly what you ingested.
How to run (recommended)¶
From the repo root:
make all-both YEARS="YYYY YYYY..." MONTHS="1 2 .. 12"
The Makefile sets environment variables (e.g. `CITIBIKE_PARQUET_DIR`, `CITIBIKE_YEARS`, `CITIBIKE_MONTHS`) which this notebook reads.
In [1]:
# --- Setup (STRICT): load summaries + FORCE year/month risk tables (NO overall fallback) ---
from __future__ import annotations
from pathlib import Path
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
# Notebook-friendly display
try:
from IPython.display import display, Markdown
except Exception:
display = print
Markdown = lambda x: x
# Ensure figures render in executed notebook
try:
get_ipython().run_line_magic("matplotlib", "inline")
except Exception:
pass
plt.ioff() # nbconvert-friendly
# ---------------------------
# Repo discovery
# ---------------------------
def find_repo_root(start: Path) -> Path:
start = start.resolve()
for p in [start, *start.parents]:
if (p / "Makefile").exists():
return p
raise FileNotFoundError(f"Could not find repo root (Makefile) from CWD={Path.cwd().resolve()}")
REPO_ROOT = find_repo_root(Path.cwd())
SUMMARIES_ROOT = REPO_ROOT / "summaries"
# ---------------------------
# Helpers
# ---------------------------
def _parse_int_list(val: str | None):
if val is None:
return None
s = str(val).strip()
if not s:
return None
parts = re.split(r"[,\s]+", s)
out = []
for p in parts:
if not p:
continue
try:
out.append(int(p))
except Exception:
pass
return out or None
def read_csv_strict(path: Path) -> pd.DataFrame:
if not path.exists():
raise FileNotFoundError(f"Missing required CSV: {path}")
return pd.read_csv(path)
def read_csv_optional(path: Path) -> pd.DataFrame | None:
return pd.read_csv(path) if path.exists() else None
def _filter_year_month(df: pd.DataFrame, years: list[int] | None, months: list[int] | None) -> pd.DataFrame:
out = df.copy()
if years is not None and "year" in out.columns:
out["year"] = pd.to_numeric(out["year"], errors="coerce")
out = out[out["year"].isin(years)]
if months is not None and "month" in out.columns:
out["month"] = pd.to_numeric(out["month"], errors="coerce")
out = out[out["month"].isin(months)]
return out
# ---------------------------
# Inputs from Makefile
# ---------------------------
PARQUET_DIR_ENV = (os.environ.get("CITIBIKE_PARQUET_DIR") or "").strip()
RUN_DIR_ENV = (os.environ.get("CITIBIKE_RUN_DIR") or "").strip()
MODE_ENV = (os.environ.get("CITIBIKE_MODE") or os.environ.get("MODE") or "").strip().lower()
YEARS_FILTER = _parse_int_list(os.environ.get("CITIBIKE_YEARS") or os.environ.get("YEARS"))
MONTHS_FILTER = _parse_int_list(os.environ.get("CITIBIKE_MONTHS") or os.environ.get("MONTHS"))
PARQUET_DIR = Path(PARQUET_DIR_ENV) if PARQUET_DIR_ENV else Path()
if RUN_DIR_ENV:
RUN_DIR = Path(RUN_DIR_ENV)
else:
run_tag = PARQUET_DIR.name if str(PARQUET_DIR).strip() else ""
RUN_DIR = (SUMMARIES_ROOT / run_tag) if run_tag else Path()
# Resolve relative -> absolute
if str(RUN_DIR).strip() and not RUN_DIR.is_absolute():
RUN_DIR = (REPO_ROOT / RUN_DIR).resolve()
if str(PARQUET_DIR).strip() and not PARQUET_DIR.is_absolute():
PARQUET_DIR = (REPO_ROOT / PARQUET_DIR).resolve()
# ---------------------------
# Strict checks
# ---------------------------
if not SUMMARIES_ROOT.exists():
raise FileNotFoundError(f"Expected summaries/ folder at: {SUMMARIES_ROOT}")
if not RUN_DIR.exists():
raise FileNotFoundError(f"Expected run summaries at: {RUN_DIR}")
print("REPO_ROOT:", REPO_ROOT)
print("RUN_DIR:", RUN_DIR)
print("PARQUET_DIR:", PARQUET_DIR if str(PARQUET_DIR).strip() else "(not set)")
print("MODE (env):", MODE_ENV or "(not set)")
print("YEARS_FILTER:", YEARS_FILTER, "MONTHS_FILTER:", MONTHS_FILTER)
# Figures dir
FIG_DIR = REPO_ROOT / "reports" / RUN_DIR.name / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)
print("FIG_DIR:", FIG_DIR)
def savefig(filename: str):
out = FIG_DIR / filename
plt.savefig(out, dpi=200, bbox_inches="tight")
print("Saved:", out)
# ---------------------------
# Load core per-run summaries (required)
# ---------------------------
df_year = read_csv_strict(RUN_DIR / "citibike_trips_by_year.csv")
df_month = read_csv_strict(RUN_DIR / "citibike_trips_by_month.csv")
df_dow = read_csv_strict(RUN_DIR / "citibike_trips_by_dow.csv")
df_hour = read_csv_strict(RUN_DIR / "citibike_trips_by_hour.csv")
# Optional outputs
df_station = read_csv_optional(RUN_DIR / "citibike_station_exposure.csv")
# ---------------------------
# Load RISK (FORCE granular)
# ---------------------------
risk_year_path = RUN_DIR / "station_risk_exposure_plus_crashproximity_by_year.csv"
risk_ym_path = RUN_DIR / "station_risk_exposure_plus_crashproximity_by_year_month.csv"
df_risk_year = read_csv_optional(risk_year_path)
df_risk_ym = read_csv_optional(risk_ym_path)
# Force granular selection (no overall fallback)
if df_risk_ym is not None:
df_risk = df_risk_ym
risk_source = "by_year_month"
elif df_risk_year is not None:
df_risk = df_risk_year
risk_source = "by_year"
else:
df_risk = None
risk_source = "missing"
raise FileNotFoundError(
"No per-year/per-month risk CSVs found in RUN_DIR.\n"
f"Expected one of:\n - {risk_ym_path}\n - {risk_year_path}\n"
"If you truly want overall-only risk, load station_risk_exposure_plus_crashproximity.csv explicitly (but you said you don't)."
)
# Highlights
highlights_path = RUN_DIR / "summary_highlights.md"
# Mode detection
mode = (
str(df_year["mode"].iloc[0]).lower()
if ("mode" in df_year.columns and len(df_year))
else (MODE_ENV or "unknown")
)
print("Detected mode:", mode)
# Apply filters defensively
df_year = _filter_year_month(df_year, YEARS_FILTER, MONTHS_FILTER)
df_month = _filter_year_month(df_month, YEARS_FILTER, MONTHS_FILTER)
df_dow = _filter_year_month(df_dow, YEARS_FILTER, MONTHS_FILTER)
df_hour = _filter_year_month(df_hour, YEARS_FILTER, MONTHS_FILTER)
df_risk = _filter_year_month(df_risk, YEARS_FILTER, MONTHS_FILTER)
run_label = RUN_DIR.name
print("\nRisk files found:")
print(" - by_year:", "YES" if df_risk_year is not None else "NO")
print(" - by_year_month:", "YES" if df_risk_ym is not None else "NO")
print("Using df_risk =", risk_source)
print("df_risk columns:", list(df_risk.columns))
print("Unique years in df_risk:", sorted(pd.to_numeric(df_risk["year"], errors="coerce").dropna().unique().astype(int).tolist()) if "year" in df_risk.columns else "(no year)")
print("Unique months in df_risk:", sorted(pd.to_numeric(df_risk["month"], errors="coerce").dropna().unique().astype(int).tolist()) if "month" in df_risk.columns else "(no month)")
REPO_ROOT: /home/hamzeh-khanpour/Desktop/CITIBIK-main RUN_DIR: /home/hamzeh-khanpour/Desktop/CITIBIK-main/summaries/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc PARQUET_DIR: /home/hamzeh-khanpour/Desktop/CITIBIK-main/data/processed/citibike_parquet/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc MODE (env): nyc YEARS_FILTER: [2019, 2020, 2023, 2025] MONTHS_FILTER: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] FIG_DIR: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures Detected mode: nyc Risk files found: - by_year: YES - by_year_month: YES Using df_risk = by_year_month df_risk columns: ['mode', 'start_station_id', 'start_station_name', 'year', 'month', 'trips', 'start_trips', 'end_trips', 'touchpoints', 'station_lat', 'station_lng', 'crashes_within_500m', 'crashes_within_500m_per_100k_trips', 'risk_proxy_available', 'data_quality'] Unique years in df_risk: [2019, 2020, 2023, 2025] Unique months in df_risk: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
Risk Distribution Overview¶
In [2]:
# --- HIGH-RISK STATIONS (nbconvert-safe, self-contained + year/month column on plot) ---
import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
MIN_TRIPS = 5000
TOP_N = 10
HIGH_RISK_PCT = 90 # top 10% by risk rate among credible rows
_SCORE_RE = re.compile(r"^axa_partner_scorecard_(\d+)m\.csv$")
def _available_scorecards(run_dir: Path) -> list[int]:
radii = []
for p in run_dir.glob("axa_partner_scorecard_*m.csv"):
m = _SCORE_RE.match(p.name)
if m:
radii.append(int(m.group(1)))
return sorted(set(radii))
def _pick_scorecard_path(run_dir: Path) -> Path | None:
radii = _available_scorecards(run_dir)
if not radii:
return None
# Stable default: prefer 500m if present; else max available
chosen = 500 if 500 in radii else max(radii)
p = run_dir / f"axa_partner_scorecard_{chosen}m.csv"
return p if p.exists() else None
def _safe_read_csv(p: Path) -> pd.DataFrame | None:
try:
return pd.read_csv(p) if p and p.exists() else None
except Exception as e:
print(f"ERROR reading {p}: {e}")
return None
# -----------------------------
# Ensure df_score exists (no NameError under nbconvert)
# -----------------------------
if "RUN_DIR" in globals() and RUN_DIR is not None:
run_dir = Path(RUN_DIR)
else:
run_dir = Path(".").resolve()
if "df_score" not in globals() or df_score is None or len(df_score) == 0:
score_path = _pick_scorecard_path(run_dir)
df_score = _safe_read_csv(score_path) if score_path else None
if df_score is None or len(df_score) == 0:
print(" No scoring data found (df_score is missing/empty).")
print("RUN_DIR used:", run_dir)
else:
print("\n" + "=" * 70)
print("HIGH-RISK STATIONS ANALYSIS")
print("=" * 70)
s = df_score.copy()
# ---- Resolve column names robustly ----
id_col = "start_station_id" if "start_station_id" in s.columns else ("station_id" if "station_id" in s.columns else None)
name_col = "start_station_name" if "start_station_name" in s.columns else ("station_name" if "station_name" in s.columns else None)
lat_col = "station_lat" if "station_lat" in s.columns else ("lat" if "lat" in s.columns else None)
lng_col = "station_lng" if "station_lng" in s.columns else ("lng" if "lng" in s.columns else None)
exposure_col = "exposure_trips" if "exposure_trips" in s.columns else ("trips" if "trips" in s.columns else None)
rate_col = "eb_risk_rate_per_100k_trips" if "eb_risk_rate_per_100k_trips" in s.columns else (
"risk_rate_per_100k_trips" if "risk_rate_per_100k_trips" in s.columns else None
)
crash_col = "crash_count" if "crash_count" in s.columns else None
cred_col = "credibility_flag" if "credibility_flag" in s.columns else None
# Helpful context columns (optional)
has_year = "year" in s.columns
has_month = "month" in s.columns
missing_required = [k for k, v in {
"station_id": id_col,
"station_name": name_col,
"exposure_trips": exposure_col,
"risk_rate": rate_col,
}.items() if v is None]
if missing_required:
print("Cannot run high-risk analysis; missing columns:", missing_required)
print("Available columns:", list(s.columns))
else:
# ---- Clean numeric columns ----
s[exposure_col] = pd.to_numeric(s[exposure_col], errors="coerce")
s[rate_col] = pd.to_numeric(s[rate_col], errors="coerce")
if crash_col is not None:
s[crash_col] = pd.to_numeric(s[crash_col], errors="coerce")
if has_year:
s["year"] = pd.to_numeric(s["year"], errors="coerce").astype("Int64")
if has_month:
s["month"] = pd.to_numeric(s["month"], errors="coerce").astype("Int64")
# ---- Credible definition ----
if cred_col is not None:
credible = s[s[cred_col].astype(str).str.lower() == "credible"].copy()
else:
credible = s[s[exposure_col] >= MIN_TRIPS].copy()
print(f"Note: no credibility_flag column; using exposure_trips ≥ {MIN_TRIPS:,} as credible.")
credible = credible.dropna(subset=[exposure_col, rate_col]).copy()
credible = credible[credible[exposure_col] >= MIN_TRIPS].copy()
if len(credible) == 0:
print("No credible rows with usable risk data.")
else:
# ---- Define high risk threshold on the credible distribution ----
cutoff = float(np.nanpercentile(credible[rate_col].values, HIGH_RISK_PCT))
high_risk = credible[credible[rate_col] >= cutoff].copy()
print(f"High-risk threshold: {HIGH_RISK_PCT}th percentile of {rate_col} among credible rows")
print(f"Cutoff value: {cutoff:.2f}")
print(f"High-risk rows: {len(high_risk):,}")
print(f"Total exposure in high-risk rows: {high_risk[exposure_col].sum():,.0f} trips")
# ---- Show Top N by risk rate ----
cols_to_show = [c for c in [
"mode",
"year" if has_year else None,
"month" if has_month else None,
id_col, name_col,
lat_col, lng_col,
exposure_col,
crash_col,
rate_col,
] if c and c in high_risk.columns]
top_risk = (
high_risk.sort_values([rate_col, exposure_col], ascending=False)
.head(TOP_N)[cols_to_show]
.copy()
)
print(f"\nTop {TOP_N} Highest-Risk (credible) station-periods:")
display(top_risk)
# ---- Print run scope so the plot is defensible ----
if has_year and has_month:
scope = (
s.dropna(subset=["year", "month"])
.groupby(["year", "month"])
.size()
.reset_index(name="rows")
.sort_values(["year", "month"])
)
print("\nScorecard scope (rows by year-month):")
# nbconvert-safe print (display may not render)
print(scope.to_string(index=False))
# ---- Plot Top N with a Year/Month column on the right ----
try:
top_plot = top_risk.sort_values(rate_col, ascending=True).copy()
fig, ax = plt.subplots(figsize=(12, 6))
y = np.arange(len(top_plot))
vals = top_plot[rate_col].values
ax.barh(y, vals, edgecolor="black")
# Left labels (station name)
ax.set_yticks(y)
ax.set_yticklabels([str(n)[:40] for n in top_plot[name_col]], fontsize=9)
ax.set_xlabel("Risk rate per 100k trips", fontsize=11)
ax.set_title(f"Top {TOP_N} Highest-Risk Station-Periods (Credible)", fontsize=12, fontweight="bold")
ax.grid(axis="x", alpha=0.3)
# Create space for right-side column
x_max = float(np.nanmax(vals)) if len(vals) else 0.0
pad = 0.18 * x_max if x_max > 0 else 1.0
ax.set_xlim(0, x_max + pad)
# Right column positions
x_sep = x_max + pad * 0.08
x_year = x_max + pad * 0.35
x_month = x_max + pad * 0.70
# Separator line
ax.axvline(x_sep, color="gray", linewidth=1, alpha=0.6)
# Headers + values
if has_year and has_month:
ax.text(x_year, len(y) - 0.15, "Year", ha="center", va="bottom",
fontsize=10, fontweight="bold")
ax.text(x_month, len(y) - 0.15, "Month", ha="center", va="bottom",
fontsize=10, fontweight="bold")
for i, (_, row) in enumerate(top_plot.iterrows()):
yr = row.get("year", pd.NA)
mo = row.get("month", pd.NA)
yr_txt = "" if pd.isna(yr) else str(int(yr))
mo_txt = "" if pd.isna(mo) else str(int(mo)).zfill(2)
ax.text(x_year, i, yr_txt, ha="center", va="center", fontsize=9)
ax.text(x_month, i, mo_txt, ha="center", va="center", fontsize=9)
plt.tight_layout()
savefig("top10_highest_risk_station_periods.png")
plt.show()
print("✓ Saved plot: top10_highest_risk_station_periods.png")
except Exception as e:
print(f"Could not generate plot: {e}")
====================================================================== HIGH-RISK STATIONS ANALYSIS ====================================================================== High-risk threshold: 90th percentile of eb_risk_rate_per_100k_trips among credible rows Cutoff value: 628.72 High-risk rows: 1,646 Total exposure in high-risk rows: 12,567,625 trips Top 10 Highest-Risk (credible) station-periods:
| mode | year | month | start_station_id | start_station_name | station_lat | station_lng | exposure_trips | crash_count | eb_risk_rate_per_100k_trips | |
|---|---|---|---|---|---|---|---|---|---|---|
| 152 | nyc | 2019 | 6 | 3734 | E 58 St & 1 Ave (NW Corner) | 40.759125 | -73.962658 | 5008 | 116 | 2068.172197 |
| 150 | nyc | 2019 | 2 | 529.0 | W 42 St & 8 Ave | 40.757570 | -73.990985 | 5098 | 109 | 2035.034813 |
| 84 | nyc | 2019 | 5 | 3712 | W 35 St & Dyer Ave | 40.754692 | -73.997402 | 5486 | 120 | 1999.473641 |
| 184 | nyc | 2019 | 5 | 3749 | Lexington Ave & E 36 St | 40.747574 | -73.978801 | 5108 | 112 | 1992.237399 |
| 141 | nyc | 2019 | 2 | 401.0 | Allen St & Rivington St | 40.720196 | -73.989978 | 5270 | 109 | 1979.209296 |
| 75 | nyc | 2019 | 10 | 464 | E 56 St & 3 Ave | 40.759345 | -73.967597 | 5731 | 122 | 1934.592498 |
| 87 | nyc | 2019 | 12 | 529 | W 42 St & 8 Ave | 40.757570 | -73.990985 | 5672 | 112 | 1931.177105 |
| 237 | nyc | 2019 | 11 | 3236 | W 42 St & Dyer Ave | 40.758985 | -73.993800 | 5081 | 106 | 1926.525165 |
| 188 | nyc | 2019 | 1 | 529.0 | W 42 St & 8 Ave | 40.757570 | -73.990985 | 5319 | 105 | 1907.926813 |
| 203 | nyc | 2019 | 2 | 531.0 | Forsyth St & Broome St | 40.718939 | -73.992663 | 5339 | 104 | 1878.788813 |
Scorecard scope (rows by year-month): year month rows 2019 1 769 2019 2 767 2019 3 770 2019 4 786 2019 5 796 2019 6 795 2019 7 790 2019 8 795 2019 9 808 2019 10 840 2019 11 870 2019 12 881 2020 1 917 2020 2 908 2020 3 904 2020 4 900 2020 5 940 2020 6 978 2020 7 1008 2020 8 1055 2020 9 1106 2020 10 1162 2020 11 1166 2020 12 1189 2023 1 1817 2023 2 1872 2023 3 1898 2023 4 1912 2023 5 1910 2023 6 1923 2023 7 1964 2023 8 2014 2023 9 2060 2023 10 2180 2023 11 2103 2023 12 2202 2025 1 2252 2025 2 2244 2025 3 2247 2025 4 2247 2025 5 2245 2025 6 2207 2025 7 2206 2025 8 2198 2025 9 2213 2025 10 2203 2025 11 2222 2025 12 2227
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/top10_highest_risk_station_periods.png
✓ Saved plot: top10_highest_risk_station_periods.png
In [3]:
import os
import re
import math
import pandas as pd
import matplotlib.pyplot as plt
# -----------------------------
# Helpers: parse env selections
# -----------------------------
def _parse_int_list_env(name: str):
raw = os.environ.get(name, "").strip()
if not raw:
return None
parts = re.split(r"[,\s]+", raw)
out = []
for p in parts:
p = p.strip()
if not p:
continue
try:
out.append(int(p))
except ValueError:
pass
return out or None
MODE_ENV = (os.environ.get("CITIBIKE_MODE", "") or "unknown").strip().lower()
years_env = _parse_int_list_env("CITIBIKE_YEARS")
months_env = _parse_int_list_env("CITIBIKE_MONTHS")
# -----------------------------
# Column choices
# -----------------------------
exposure_col = "exposure_trips"
rate_col = "eb_risk_rate_per_100k_trips"
crash_col = "crash_count" if "crash_count" in df_score.columns else None
needed = [exposure_col, rate_col]
missing = [c for c in needed if c not in df_score.columns]
if missing:
raise ValueError(f"df_score missing required columns: {missing}")
plot_df = df_score.copy()
# Optional: only credible rows
if "credibility_flag" in plot_df.columns:
plot_df = plot_df[plot_df["credibility_flag"] == "credible"].copy()
# Clean types
plot_df[exposure_col] = pd.to_numeric(plot_df[exposure_col], errors="coerce")
plot_df[rate_col] = pd.to_numeric(plot_df[rate_col], errors="coerce")
if crash_col is not None:
plot_df[crash_col] = pd.to_numeric(plot_df[crash_col], errors="coerce").fillna(0)
plot_df = plot_df.dropna(subset=[exposure_col, rate_col])
plot_df = plot_df[plot_df[exposure_col] > 0].copy()
has_period_cols = ("year" in plot_df.columns) and ("month" in plot_df.columns)
# -----------------------------
# Styling helpers (matches your snippet)
# -----------------------------
def _draw_scatter(ax, df_slice: pd.DataFrame):
if crash_col is not None:
sc = ax.scatter(
df_slice[exposure_col],
df_slice[rate_col],
c=df_slice[crash_col].fillna(0),
alpha=0.6,
s=30, # a bit smaller for dense multi-panels
)
return sc
else:
ax.scatter(
df_slice[exposure_col],
df_slice[rate_col],
alpha=0.6,
s=30,
)
return None
def _mark_hotspots(ax, df_slice: pd.DataFrame):
if "prevention_hotspot" in df_slice.columns:
hotspots = df_slice[df_slice["prevention_hotspot"] == True]
if len(hotspots) > 0:
ax.scatter(
hotspots[exposure_col],
hotspots[rate_col],
color="red", s=130, marker="*",
edgecolors="black", linewidths=1,
label="Prevention Hotspots", zorder=5
)
ax.legend(loc="best", fontsize=7)
def _style_axes(ax, title: str):
ax.set_xlabel("Exposure (trips)", fontsize=9)
ax.set_ylabel("EB Risk Rate (per 100k trips)", fontsize=9)
ax.set_title(title, fontsize=10, fontweight="bold")
ax.set_xscale("log")
ax.grid(alpha=0.3)
# -----------------------------
# If no year/month: do NOT crash (keeps all-both working)
# -----------------------------
if not has_period_cols:
print(f"[panel] No year/month columns for MODE={MODE_ENV}. Making overall plot only.")
fig, ax = plt.subplots(figsize=(7.2, 4.8))
sc = _draw_scatter(ax, plot_df)
_mark_hotspots(ax, plot_df)
_style_axes(ax, f"Risk vs Exposure (overall) — mode={MODE_ENV}")
if sc is not None:
plt.colorbar(sc, ax=ax, label="Crash Count")
savefig(f"risk_vs_exposure_overall_mode{MODE_ENV}.png")
plt.show()
else:
# -----------------------------
# Prepare year/month selections (no hardcoding)
# -----------------------------
plot_df["year"] = pd.to_numeric(plot_df["year"], errors="coerce")
plot_df["month"] = pd.to_numeric(plot_df["month"], errors="coerce")
plot_df = plot_df.dropna(subset=["year", "month"]).copy()
plot_df["year"] = plot_df["year"].astype(int)
plot_df["month"] = plot_df["month"].astype(int)
years_available = sorted(plot_df["year"].unique())
months_available = sorted(plot_df["month"].unique())
years_to_plot = years_env if years_env is not None else years_available
months_to_plot = months_env if months_env is not None else months_available
years_to_plot = [y for y in years_to_plot if y in years_available]
months_to_plot = [m for m in months_to_plot if m in months_available]
if not years_to_plot or not months_to_plot:
print(f"[panel] No data after filtering for MODE={MODE_ENV}. years_env={years_env}, months_env={months_env}")
else:
print(f"[panel] MODE={MODE_ENV} years={years_to_plot} months={months_to_plot} rows={len(plot_df):,}")
# =========================================================
# PANEL A (paginated): one subplot per (year, month)
# =========================================================
pairs = [(y, m) for y in years_to_plot for m in months_to_plot]
total = len(pairs)
# Pagination settings (good for up to ~48+ plots)
PANELS_PER_PAGE = 12 # 12 => 3x4 (readable)
NCOLS = 3
NROWS = int(math.ceil(PANELS_PER_PAGE / NCOLS))
pages = int(math.ceil(total / PANELS_PER_PAGE))
print(f"[panelA] Total period panels={total} -> pages={pages} (PANELS_PER_PAGE={PANELS_PER_PAGE})")
# For a stable shared color scale across pages (so colors are comparable),
# compute global vmin/vmax once.
vmin = vmax = None
if crash_col is not None and len(plot_df) > 0:
vmin = float(plot_df[crash_col].min())
vmax = float(plot_df[crash_col].max())
for page_idx in range(pages):
start = page_idx * PANELS_PER_PAGE
end = min((page_idx + 1) * PANELS_PER_PAGE, total)
page_pairs = pairs[start:end]
figA, axesA = plt.subplots(
NROWS, NCOLS,
figsize=(4.6 * NCOLS, 3.6 * NROWS),
squeeze=False
)
scatter_for_cbar = None
for i, (y, m) in enumerate(page_pairs):
r, c = divmod(i, NCOLS)
ax = axesA[r][c]
d = plot_df[(plot_df["year"] == y) & (plot_df["month"] == m)].copy()
if crash_col is not None:
sc = ax.scatter(
d[exposure_col],
d[rate_col],
c=d[crash_col].fillna(0),
alpha=0.6,
s=30,
vmin=vmin,
vmax=vmax,
)
else:
sc = ax.scatter(
d[exposure_col],
d[rate_col],
alpha=0.6,
s=30,
)
_mark_hotspots(ax, d)
_style_axes(ax, f"{y}-{m:02d} (mode={MODE_ENV})")
if scatter_for_cbar is None and crash_col is not None:
scatter_for_cbar = sc
# Turn off unused axes on last page
for j in range(len(page_pairs), NROWS * NCOLS):
r, c = divmod(j, NCOLS)
axesA[r][c].axis("off")
figA.suptitle(
f"Risk vs Exposure — mode={MODE_ENV} (page {page_idx+1}/{pages})",
fontsize=13, fontweight="bold"
)
figA.tight_layout(rect=[0.0, 0.0, 1.0, 0.93])
# Shared colorbar that matches the axes block height (NOT the full figure)
if scatter_for_cbar is not None:
import matplotlib as mpl
sm = mpl.cm.ScalarMappable(norm=scatter_for_cbar.norm, cmap=scatter_for_cbar.cmap)
sm.set_array([])
visible_axes = [ax for ax in axesA.ravel() if ax.get_visible() and ax.has_data()]
cbar = figA.colorbar(sm, ax=visible_axes, fraction=0.035, pad=0.02)
cbar.set_label("Crash Count")
savefig(f"risk_vs_exposure_panel_periods_mode{MODE_ENV}_page{page_idx+1:02d}.png")
plt.show()
# =========================================================
# PANEL B: YEARLY COMPARISON (overlay years) — one subplot per month
# =========================================================
# This scales well: one subplot per month, overlay each year.
n = len(months_to_plot)
ncols = min(3, n) if n > 1 else 1
nrows = int(math.ceil(n / ncols))
figB, axesB = plt.subplots(
nrows, ncols,
figsize=(4.6 * ncols, 3.6 * nrows),
squeeze=False
)
# If too many years, auto-switch to endpoints to keep readable
if len(years_to_plot) > 3:
years_overlay = [min(years_to_plot), max(years_to_plot)]
overlay_tag = f"endpoints_{years_overlay[0]}_{years_overlay[1]}"
else:
years_overlay = list(years_to_plot)
overlay_tag = "allYears"
for i, m in enumerate(months_to_plot):
r, c = divmod(i, ncols)
ax = axesB[r][c]
any_data = False
for y in years_overlay:
d = plot_df[(plot_df["year"] == y) & (plot_df["month"] == m)].copy()
if d.empty:
continue
any_data = True
ax.scatter(
d[exposure_col],
d[rate_col],
alpha=0.5,
s=22,
label=str(y)
)
_mark_hotspots(ax, d)
_style_axes(ax, f"Year comparison — month {m:02d} (mode={MODE_ENV})")
if any_data:
ax.legend(title="Year", fontsize=8, title_fontsize=9, loc="best")
else:
ax.text(0.5, 0.5, "No data", transform=ax.transAxes, ha="center", va="center")
ax.set_axis_off()
# Turn off unused axes
for j in range(n, nrows * ncols):
r, c = divmod(j, ncols)
axesB[r][c].axis("off")
figB.suptitle(
f"Yearly comparison (overlay: {overlay_tag}) — mode={MODE_ENV}",
fontsize=13, fontweight="bold"
)
figB.tight_layout(rect=[0.0, 0.0, 1.0, 0.93])
savefig(f"risk_vs_exposure_panel_year_compare_mode{MODE_ENV}_{overlay_tag}.png")
plt.show()
[panel] MODE=nyc years=[2019, 2020, 2023, 2025] months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] rows=16,454 [panelA] Total period panels=48 -> pages=4 (PANELS_PER_PAGE=12)
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page01.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page02.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page03.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page04.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_year_compare_modenyc_endpoints_2019_2025.png
In [4]:
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# -----------------------------
# Helpers: parse env selections
# -----------------------------
def _parse_int_list_env(name: str):
raw = os.environ.get(name, "").strip()
if not raw:
return None
parts = re.split(r"[,\s]+", raw)
out = []
for p in parts:
p = p.strip()
if not p:
continue
try:
out.append(int(p))
except ValueError:
pass
return out or None
MODE_ENV = (os.environ.get("CITIBIKE_MODE", "") or "unknown").strip().lower()
years_env = _parse_int_list_env("CITIBIKE_YEARS")
months_env = _parse_int_list_env("CITIBIKE_MONTHS")
# ========================================================================
# CHECK: Does df_score have year/month columns?
# ========================================================================
needed = ["year", "month"]
missing = [c for c in needed if c not in df_score.columns]
if missing:
print("="*80)
print(" SEASONALITY ANALYSIS SKIPPED")
print("="*80)
print(f"Reason: df_score missing required columns: {missing}")
print(f"Mode: {MODE_ENV}")
print("")
print("This is expected for JC mode (no crash data with temporal granularity).")
print("Seasonality analysis requires year/month breakdown, which only exists for NYC.")
print("")
print("Available columns in df_score:")
print(list(df_score.columns))
print("="*80)
else:
# ========================================================================
# PROCEED WITH SEASONALITY ANALYSIS
# ========================================================================
print(f" df_score has year/month columns → proceeding with seasonality analysis")
# -----------------------------
# What to plot (time-comparable)
# -----------------------------
# Use EB rate for time comparison (risk_index_pct is within-period and not ideal across time)
value_col = "eb_risk_rate_per_100k_trips"
if value_col not in df_score.columns:
if "risk_rate_per_100k_trips" in df_score.columns:
value_col = "risk_rate_per_100k_trips"
else:
raise ValueError(
"Need eb_risk_rate_per_100k_trips (or risk_rate_per_100k_trips) in df_score."
)
df = df_score.copy()
# Optional: credible only
if "credibility_flag" in df.columns:
df = df[df["credibility_flag"] == "credible"].copy()
# Clean
df["year"] = pd.to_numeric(df["year"], errors="coerce")
df["month"] = pd.to_numeric(df["month"], errors="coerce")
df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
df = df.dropna(subset=["year", "month", value_col]).copy()
df["year"] = df["year"].astype(int)
df["month"] = df["month"].astype(int)
# Apply env filters (no hardcoding)
years_available = sorted(df["year"].unique())
months_available = sorted(df["month"].unique())
years_to_use = years_env if years_env is not None else years_available
months_to_use = months_env if months_env is not None else months_available
years_to_use = [y for y in years_to_use if y in years_available]
months_to_use = [m for m in months_to_use if m in months_available]
df = df[df["year"].isin(years_to_use) & df["month"].isin(months_to_use)].copy()
if df.empty:
raise ValueError(f"No rows after filtering. years={years_to_use} months={months_to_use} mode={MODE_ENV}")
print(f"[seasonality] mode={MODE_ENV} years={years_to_use} months={months_to_use} rows={len(df):,} value={value_col}")
# -------------------------------------------------------------------
# Summary stats per (year, month): median + IQR
# -------------------------------------------------------------------
g = df.groupby(["year", "month"])[value_col]
summary = g.quantile([0.25, 0.5, 0.75]).unstack(level=-1).reset_index()
summary.columns = ["year", "month", "q25", "q50", "q75"]
# -------------------------------------------------------------------
# Plot 1: Seasonal lines (median ± IQR) per year
# -------------------------------------------------------------------
fig1, ax1 = plt.subplots(figsize=(10, 5.5))
for y in years_to_use:
s = summary[summary["year"] == y].sort_values("month")
if s.empty:
continue
ax1.plot(s["month"], s["q50"], marker="o", linewidth=2, label=str(y))
ax1.fill_between(s["month"], s["q25"], s["q75"], alpha=0.15)
ax1.set_title(f"Seasonality of station risk by month (median ± IQR) — mode={MODE_ENV}", fontweight="bold")
ax1.set_xlabel("Month")
ax1.set_ylabel(f"{value_col}")
ax1.set_xticks(sorted(months_to_use))
ax1.grid(alpha=0.3)
ax1.legend(
title="Year",
ncol=min(6, max(1, len(years_to_use))),
fontsize=9,
title_fontsize=10
)
savefig(f"seasonality_lines_median_IQR_mode{MODE_ENV}.png")
plt.show()
# -------------------------------------------------------------------
# Plot 2: Heatmap (Year × Month) of median risk with readable labels
# -------------------------------------------------------------------
pivot = (
summary.pivot(index="year", columns="month", values="q50")
.reindex(index=years_to_use, columns=sorted(months_to_use))
)
fig2, ax2 = plt.subplots(figsize=(10, 3.0 + 0.5 * len(years_to_use)))
# More readable colormap + stable perception
im = ax2.imshow(pivot.values, aspect="auto", cmap="cividis")
ax2.set_title(f"Year × Month median station risk — mode={MODE_ENV}", fontweight="bold")
ax2.set_xlabel("Month")
ax2.set_ylabel("Year")
ax2.set_xticks(range(len(pivot.columns)))
ax2.set_xticklabels([f"{m:02d}" for m in pivot.columns])
ax2.set_yticks(range(len(pivot.index)))
ax2.set_yticklabels([str(y) for y in pivot.index])
cbar = fig2.colorbar(im, ax=ax2, fraction=0.03, pad=0.02)
cbar.set_label(f"Median {value_col}")
# Adaptive text color + subtle bbox so numbers are readable on dark cells
norm = im.norm
cmap = im.cmap
for i, y in enumerate(pivot.index):
for j, m in enumerate(pivot.columns):
val = pivot.loc[y, m]
if pd.isna(val):
continue
rgba = cmap(norm(val))
luminance = 0.299 * rgba[0] + 0.587 * rgba[1] + 0.114 * rgba[2]
txt_color = "black" if luminance > 0.6 else "white"
ax2.text(
j, i, f"{val:.0f}",
ha="center", va="center",
fontsize=9, fontweight="bold",
color=txt_color,
bbox=dict(
boxstyle="round,pad=0.15",
facecolor=("white" if txt_color == "black" else "black"),
alpha=0.20,
edgecolor="none"
)
)
fig2.tight_layout()
savefig(f"seasonality_heatmap_year_month_mode{MODE_ENV}.png")
plt.show()
df_score has year/month columns → proceeding with seasonality analysis [seasonality] mode=nyc years=[2019, 2020, 2023, 2025] months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] rows=16,454 value=eb_risk_rate_per_100k_trips
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/seasonality_lines_median_IQR_modenyc.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/seasonality_heatmap_year_month_modenyc.png
In [5]:
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
def _parse_int_list_env(name: str):
raw = os.environ.get(name, "").strip()
if not raw:
return None
parts = re.split(r"[,\s]+", raw)
out = []
for p in parts:
p = p.strip()
if not p:
continue
try:
out.append(int(p))
except ValueError:
pass
return out or None
MODE_ENV = (os.environ.get("CITIBIKE_MODE", "") or "unknown").strip().lower()
years_env = _parse_int_list_env("CITIBIKE_YEARS")
months_env = _parse_int_list_env("CITIBIKE_MONTHS")
rate_col = "eb_risk_rate_per_100k_trips"
exp_col = "exposure_trips"
id_col = "start_station_id" if "start_station_id" in df_score.columns else None
needed = [rate_col, exp_col]
missing = [c for c in needed if c not in df_score.columns]
if missing:
raise ValueError(f"df_score missing required columns: {missing}")
df = df_score.copy()
# Optional: credible only (keeps distributions stable)
if "credibility_flag" in df.columns:
df = df[df["credibility_flag"] == "credible"].copy()
# Clean
df[rate_col] = pd.to_numeric(df[rate_col], errors="coerce")
df[exp_col] = pd.to_numeric(df[exp_col], errors="coerce")
df = df.dropna(subset=[rate_col, exp_col])
df = df[df[exp_col] > 0].copy()
has_period_cols = ("year" in df.columns) and ("month" in df.columns)
if not has_period_cols:
print(f"[yearly-plots] No year/month columns for MODE={MODE_ENV}. Skipping yearly comparison plots.")
else:
df["year"] = pd.to_numeric(df["year"], errors="coerce")
df["month"] = pd.to_numeric(df["month"], errors="coerce")
df = df.dropna(subset=["year", "month"]).copy()
df["year"] = df["year"].astype(int)
df["month"] = df["month"].astype(int)
years_available = sorted(df["year"].unique())
months_available = sorted(df["month"].unique())
years_to_use = years_env if years_env is not None else years_available
months_to_use = months_env if months_env is not None else months_available
years_to_use = [y for y in years_to_use if y in years_available]
months_to_use = [m for m in months_to_use if m in months_available]
# Filter to chosen months/years
df = df[df["year"].isin(years_to_use) & df["month"].isin(months_to_use)].copy()
if df.empty:
print(f"[yearly-plots] Empty after filtering. years={years_to_use} months={months_to_use} MODE={MODE_ENV}")
else:
print(f"[yearly-plots] MODE={MODE_ENV} years={years_to_use} months={months_to_use} rows={len(df):,}")
# =========================================================
# Plot A: Distribution by year (boxplot of station-period EB rates)
# =========================================================
figA, axA = plt.subplots(figsize=(8.5, 4.8))
data_by_year = [df.loc[df["year"] == y, rate_col].values for y in years_to_use]
# Use tick_labels (new matplotlib) + style median line explicitly
bp = axA.boxplot(
data_by_year,
tick_labels=[str(y) for y in years_to_use],
showfliers=False,
medianprops=dict(color="red", linewidth=2),
)
axA.set_title(f"Distribution of EB Risk Rate by Year — mode={MODE_ENV}", fontweight="bold")
axA.set_xlabel("Year")
axA.set_ylabel("EB Risk Rate (per 100k trips)")
axA.grid(alpha=0.3)
# Legend entry to explain the horizontal line is the median
median_handle = mlines.Line2D([], [], color="red", linewidth=2, label="Median")
axA.legend(handles=[median_handle], loc="best")
# # Optional: also add a small in-plot annotation (comment out if you prefer legend only)
# axA.text(
# 0.98, 0.98,
# "Red line = median",
# transform=axA.transAxes,
# ha="right", va="top",
# fontsize=10,
# bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.85, edgecolor="gray")
# )
savefig(f"yearly_distribution_boxplot_mode{MODE_ENV}.png")
plt.show()
# =========================================================
# Plot B: Station-level change (endpoints slopegraph)
# =========================================================
if id_col is None:
print("[yearly-plots] No start_station_id column; skipping station-level change plot.")
else:
# Aggregate within each year across selected months:
# exposure-weighted average EB rate (stable)
def _agg_one(grp: pd.DataFrame) -> pd.Series:
w = grp[exp_col].to_numpy()
x = grp[rate_col].to_numpy()
wsum = float(np.sum(w))
if wsum <= 0 or len(x) == 0:
return pd.Series({"exposure_sum": 0.0, "eb_rate_weighted": np.nan})
return pd.Series({"exposure_sum": wsum, "eb_rate_weighted": float(np.average(x, weights=w))})
g = (
df.groupby([id_col, "year"], as_index=False)
.apply(_agg_one)
.reset_index(drop=True)
)
y0, y1 = min(years_to_use), max(years_to_use)
gg0 = g[g["year"] == y0][[id_col, "exposure_sum", "eb_rate_weighted"]].rename(
columns={"exposure_sum": "exp0", "eb_rate_weighted": "rate0"}
)
gg1 = g[g["year"] == y1][[id_col, "exposure_sum", "eb_rate_weighted"]].rename(
columns={"exposure_sum": "exp1", "eb_rate_weighted": "rate1"}
)
merged = gg0.merge(gg1, on=id_col, how="inner").dropna(subset=["rate0", "rate1"]).copy()
if merged.empty:
print(f"[yearly-plots] No stations present in BOTH {y0} and {y1}; skipping slopegraph.")
else:
merged["exp_total"] = merged["exp0"] + merged["exp1"]
TOP_N = 120
merged = merged.sort_values("exp_total", ascending=False).head(TOP_N).copy()
figB, axB = plt.subplots(figsize=(8.5, 5.5))
x_positions = [0, 1]
for _, r in merged.iterrows():
axB.plot(
x_positions,
[r["rate0"], r["rate1"]],
alpha=0.25,
linewidth=1
)
# Median line for context
axB.plot(
x_positions,
[merged["rate0"].median(), merged["rate1"].median()],
linewidth=3,
label="Median (top exposure)"
)
axB.set_xticks(x_positions)
axB.set_xticklabels([str(y0), str(y1)])
axB.set_title(
f"Station-level EB Risk Change (Exposure-weighted) — months={months_to_use} — mode={MODE_ENV}",
fontweight="bold"
)
axB.set_ylabel("EB Risk Rate (per 100k trips)")
axB.grid(alpha=0.3)
axB.legend()
savefig(f"station_change_slopegraph_{y0}_to_{y1}_mode{MODE_ENV}.png")
plt.show()
[yearly-plots] MODE=nyc years=[2019, 2020, 2023, 2025] months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] rows=16,454
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/yearly_distribution_boxplot_modenyc.png
[yearly-plots] No stations present in BOTH 2019 and 2025; skipping slopegraph.
/tmp/ipykernel_7412/2327989950.py:130: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. .apply(_agg_one)
In [ ]:
Executive Risk Summary¶
In [6]:
# --- EXECUTIVE RISK SUMMARY (nbconvert-safe, self-contained) ---
from pathlib import Path
import pandas as pd
import numpy as np
MIN_TRIPS = 5000
# Make sure df_score exists (avoid NameError)
if "df_score" not in globals() or df_score is None or len(df_score) == 0:
print(" df_score is missing/empty — cannot build executive summary.")
else:
print("\n" + "="*70)
print("EXECUTIVE RISK SUMMARY FOR AXA")
print("="*70)
s = df_score.copy()
# Defensive: ensure credibility_flag exists
if "credibility_flag" not in s.columns:
if "exposure_trips" in s.columns:
s["credibility_flag"] = np.where(s["exposure_trips"] >= MIN_TRIPS, "credible", "insufficient_data")
elif "trips" in s.columns:
s["credibility_flag"] = np.where(s["trips"] >= MIN_TRIPS, "credible", "insufficient_data")
else:
s["credibility_flag"] = "unknown"
# Decide exposure column
exposure_col = "exposure_trips" if "exposure_trips" in s.columns else ("trips" if "trips" in s.columns else None)
if exposure_col is not None:
s[exposure_col] = pd.to_numeric(s[exposure_col], errors="coerce").fillna(0)
# Infer mode label safely
mode_label = "unknown"
if "mode" in s.columns and s["mode"].notna().any():
uniq_modes = sorted(s["mode"].astype(str).unique().tolist())
mode_label = uniq_modes[0] if len(uniq_modes) == 1 else "multiple"
# Build credible_stations HERE (fixes your NameError)
credible_stations = s[s["credibility_flag"].astype(str).str.lower() == "credible"].copy()
total_rows = len(s)
credible_count = len(credible_stations)
summary_data = []
summary_data.append({
"Metric": "Total station-period rows",
"Value": f"{total_rows:,}",
"Notes": f"mode={mode_label}"
})
if total_rows > 0:
pct_cred = 100.0 * credible_count / total_rows
summary_data.append({
"Metric": f"Credible rows (≥{MIN_TRIPS:,} trips)",
"Value": f"{credible_count:,} ({pct_cred:.1f}%)",
"Notes": "Enough volume to trust risk ranking"
})
# Optional: risk_tier counts (only if column exists)
if "risk_tier" in credible_stations.columns:
for tier, note in [
("High Risk", "Premium pricing / targeted prevention"),
("Medium Risk", "Standard pricing"),
("Low Risk", "Discount / acquisition-friendly"),
]:
n = int((credible_stations["risk_tier"].astype(str) == tier).sum())
pct = (100.0 * n / credible_count) if credible_count > 0 else 0.0
summary_data.append({
"Metric": f"{tier} rows",
"Value": f"{n:,} ({pct:.1f}%)",
"Notes": note
})
# Hotspots (only if columns exist) — and handle True/False safely
def _safe_true_count(df: pd.DataFrame, col: str) -> int:
if col not in df.columns:
return 0
v = df[col]
if v.dtype == bool:
return int(v.sum())
# tolerate strings like "True"/"False"
return int(v.astype(str).str.lower().isin(["true", "1", "yes"]).sum())
for col, label, note in [
("prevention_hotspot", "Prevention hotspots", "High exposure + high risk → safety pilots"),
("product_hotspot", "Product hotspots", "High exposure → strong sales reach"),
("acquisition_hotspot", "Acquisition hotspots", "High exposure + low risk → easy wins"),
]:
if col in s.columns:
n = _safe_true_count(s, col)
summary_data.append({
"Metric": label,
"Value": f"{n:,}",
"Notes": note
})
df_summary = pd.DataFrame(summary_data)
display(df_summary)
# Save summary (RUN_DIR safe)
out_dir = None
if "RUN_DIR" in globals() and RUN_DIR is not None:
out_dir = Path(RUN_DIR)
else:
out_dir = Path(".").resolve()
out_path = out_dir / "risk_executive_summary.csv"
df_summary.to_csv(out_path, index=False)
print(f"\nSaved executive summary to: {out_path}")
====================================================================== EXECUTIVE RISK SUMMARY FOR AXA ======================================================================
| Metric | Value | Notes | |
|---|---|---|---|
| 0 | Total station-period rows | 72,466 | mode=nyc |
| 1 | Credible rows (≥5,000 trips) | 16,454 (22.7%) | Enough volume to trust risk ranking |
| 2 | Prevention hotspots | 2,557 | High exposure + high risk → safety pilots |
| 3 | Product hotspots | 14,494 | High exposure → strong sales reach |
| 4 | Acquisition hotspots | 10,223 | High exposure + low risk → easy wins |
Saved executive summary to: /home/hamzeh-khanpour/Desktop/CITIBIK-main/summaries/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/risk_executive_summary.csv